Main Figures
Have a legend for all plots
legend_plot <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = season)) +
geom_jitter(size = 3, width = 0.2) +
scale_fill_manual(values = fraction_colors, guide = guide_legend(override.aes = list(shape = 22, size = 4))) +
scale_shape_manual(values = season_shapes) +
theme(legend.position = "bottom", axis.title.x = element_blank(),
legend.title = element_blank())
season_legend <- get_legend(legend_plot)Figure 1
######################################################### Fraction ABUNDANCe
frac_abund_wilcox <- wilcox.test(log10(as.numeric(fraction_bac_abund)) ~ fraction, data = metadata)
frac_abund_wilcox##
## Wilcoxon rank sum test
##
## data: log10(as.numeric(fraction_bac_abund)) by fraction
## W = 0, p-value = 1.479e-06
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
group_by(fraction) %>%
summarize(mean(as.numeric(fraction_bac_abund)))## # A tibble: 2 x 2
## fraction `mean(as.numeric(fraction_bac_abund))`
## <fctr> <dbl>
## 1 Particle 41168.88
## 2 Free 734522.25
# Make a data frame to draw significance line in boxplot (visually calculated)
dat1 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(6.45,6.5,6.5,6.45)) # WholePart vs WholeFree
poster_a <-
ggplot(filter(metadata, norep_filter_name != "MOTEJ515"),
aes(y = log10(as.numeric(fraction_bac_abund)), x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
ylab("Log10(Bacterial Counts) \n (cells/mL)") +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
##### Particle vs free cell abundances
geom_path(data = dat1, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=6.55, size = 8, color = "gray40", label= paste("***")) +
annotate("text", x=1.5, y=6.4, size = 3.5, color = "gray40",
label= paste("p =", round(frac_abund_wilcox$p.value, digits = 6))) +
theme(legend.position = "bottom",
legend.title = element_blank(),
axis.title.x = element_blank())
######################################################### TOTAL PRODUCTION
totprod_wilcox <- wilcox.test(frac_bacprod ~ fraction, data = metadata)
totprod_wilcox##
## Wilcoxon rank sum test
##
## data: frac_bacprod by fraction
## W = 33, p-value = 0.02418
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
group_by(fraction) %>%
summarize(mean(frac_bacprod))## # A tibble: 2 x 2
## fraction `mean(frac_bacprod)`
## <fctr> <dbl>
## 1 Particle 9.958333
## 2 Free 24.058333
# Make a data frame to draw significance line in boxplot (visually calculated)
dat2 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(71,72,72,71)) # WholePart vs WholeFree
poster_b <- ggplot(metadata, aes(y = frac_bacprod, x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
scale_y_continuous(limits = c(0, 78), expand = c(0,0), breaks = seq(0, 75, by = 15)) +
ylab("Community Production \n (μgC/L/day)") +
##### Particle vs free bulk production
geom_path(data = dat2, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=73, size = 8, color = "gray40", label= paste("*")) +
annotate("text", x=1.5, y=69, size = 3.5, color = "gray40",
label= paste("p =", round(totprod_wilcox$p.value, digits = 3))) +
theme(legend.position = "none",
axis.title.x = element_blank())
######################################################### TOTAL PRODUCTION
percellprod_wilcox <- wilcox.test(log10(fracprod_per_cell) ~ fraction, data = filter(metadata, norep_filter_name != "MOTEJ515"))
percellprod_wilcox##
## Wilcoxon rank sum test
##
## data: log10(fracprod_per_cell) by fraction
## W = 125, p-value = 6.656e-05
## alternative hypothesis: true location shift is not equal to 0
filter(metadata, norep_filter_name != "MOTEJ515") %>%
group_by(fraction) %>%
summarize(mean(fracprod_per_cell))## # A tibble: 2 x 2
## fraction `mean(fracprod_per_cell)`
## <fctr> <dbl>
## 1 Particle 4.816116e-07
## 2 Free 3.866798e-08
# Make a data frame to draw significance line in boxplot (visually calculated)
dat3 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(-5.05,-5,-5,-5.05)) # WholePart vs WholeFree
poster_c <- ggplot(filter(metadata, norep_filter_name != "MOTEJ515"),
aes(y = log10(fracprod_per_cell), x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
ylim(c(-8.5, -4.9)) +
ylab("Log10(Per-Capita Production) \n(μgC/cell/day)") +
##### Particle vs free per-cell production
geom_path(data = dat3, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=-4.95, size = 8, color = "gray40", label= paste("***")) +
annotate("text", x=1.5, y=-5.15, size = 3.5, color = "gray40",
label= paste("p =", round(percellprod_wilcox$p.value, digits = 5))) +
theme(legend.position = "none",
axis.title.x = element_blank())
######## FIGURE 1
# legend
legend <- get_legend(poster_a)
row1_plots <- plot_grid(poster_a + theme(legend.position = "none"), poster_b, poster_c,
labels = c("A", "B", "C"),
ncol = 3, nrow = 1)
#fig_1 <-
plot_grid(row1_plots, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05))Linear Regressions with Fraction Diversity
#################################### Bulk Measure Production ####################################
################### Richness ###################
summary(lm(frac_bacprod ~ mean, data = richness)) # All samples together##
## Call:
## lm(formula = frac_bacprod ~ mean, data = richness)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.057 -12.017 -5.654 9.256 46.605
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.501168 9.039579 2.047 0.0528 .
## mean -0.003335 0.018911 -0.176 0.8616
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.54 on 22 degrees of freedom
## Multiple R-squared: 0.001412, Adjusted R-squared: -0.04398
## F-statistic: 0.0311 on 1 and 22 DF, p-value: 0.8616
# Particle-Associated
lm_prod_vs_rich_PA <- lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle"))
summary(lm_prod_vs_rich_PA)##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1271 -2.8865 -0.4649 3.7537 9.8401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.269531 5.717450 -1.971 0.07701 .
## mean 0.038125 0.009868 3.863 0.00314 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.476 on 10 degrees of freedom
## Multiple R-squared: 0.5988, Adjusted R-squared: 0.5587
## F-statistic: 14.93 on 1 and 10 DF, p-value: 0.003143
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_rich_PA <- train(
frac_bacprod ~ mean, data = filter(richness, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_rich_PA$results # Particle-Associated CV results ## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 6.714924 0.6410184 5.362125 2.214199 0.2966807 1.819534
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Free"))) # Free Living ##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction ==
## "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.094 -11.112 -2.755 8.611 33.308
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.40793 21.62006 0.250 0.808
## mean 0.05511 0.06207 0.888 0.395
##
## Residual standard error: 17.7 on 10 degrees of freedom
## Multiple R-squared: 0.07306, Adjusted R-squared: -0.01963
## F-statistic: 0.7882 on 1 and 10 DF, p-value: 0.3955
################### Shannon Entropy ###################
summary(lm(frac_bacprod ~ mean, data = shannon)) # All samples together##
## Call:
## lm(formula = frac_bacprod ~ mean, data = shannon)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.659 -11.878 -5.666 9.231 46.593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.63373 26.71652 0.623 0.540
## mean 0.08797 6.22969 0.014 0.989
##
## Residual standard error: 15.55 on 22 degrees of freedom
## Multiple R-squared: 9.064e-06, Adjusted R-squared: -0.04545
## F-statistic: 0.0001994 on 1 and 22 DF, p-value: 0.9889
# Particle-Associated
lm_prod_vs_shannon_PA <- lm(frac_bacprod ~ mean, data = filter(shannon, fraction == "Particle"))
summary(lm_prod_vs_shannon_PA)##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(shannon, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9022 -2.9150 -0.5875 1.6713 12.0544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -40.320 14.007 -2.878 0.01643 *
## mean 11.089 3.068 3.614 0.00473 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.693 on 10 degrees of freedom
## Multiple R-squared: 0.5664, Adjusted R-squared: 0.5231
## F-statistic: 13.06 on 1 and 10 DF, p-value: 0.004734
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_shannon_PA <- train(
frac_bacprod ~ mean, data = filter(shannon, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_shannon_PA$results # Particle-Associated CV results ## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 6.483534 0.6694728 5.092528 2.388309 0.2923481 1.952157
summary(lm(frac_bacprod ~ mean, data = filter(shannon, fraction == "Free"))) # Free Living ##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(shannon, fraction ==
## "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.004 -10.708 -3.738 6.632 37.129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.37 73.87 -0.181 0.860
## mean 9.40 18.50 0.508 0.622
##
## Residual standard error: 18.15 on 10 degrees of freedom
## Multiple R-squared: 0.02516, Adjusted R-squared: -0.07233
## F-statistic: 0.2581 on 1 and 10 DF, p-value: 0.6225
################### Inverse Simpson ###################
summary(lm(frac_bacprod ~ mean, data = invsimps)) # All samples together##
## Call:
## lm(formula = frac_bacprod ~ mean, data = invsimps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.269 -10.298 -4.916 5.866 46.452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.1577 6.0225 2.019 0.0559 .
## mean 0.1629 0.1731 0.941 0.3570
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.25 on 22 degrees of freedom
## Multiple R-squared: 0.03868, Adjusted R-squared: -0.005019
## F-statistic: 0.8851 on 1 and 22 DF, p-value: 0.357
# Particle-Associated samples
lm_prod_vs_invsimps_PA <- lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle"))
summary(lm_prod_vs_invsimps_PA)##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5589 -2.1093 -0.1969 0.8752 7.8282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.45199 2.43866 -0.185 0.85666
## mean 0.29344 0.05781 5.076 0.00048 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.571 on 10 degrees of freedom
## Multiple R-squared: 0.7204, Adjusted R-squared: 0.6925
## F-statistic: 25.77 on 1 and 10 DF, p-value: 0.0004804
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_invsimps_PA <- train(
frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_invsimps_PA$results # Cross Validated PA results## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 5.181725 0.7603274 3.923763 2.163189 0.2516618 1.792041
#Free Living Samples
summary(lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Free")))##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction ==
## "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.765 -9.356 -4.445 6.116 36.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.0985 16.7052 0.664 0.521
## mean 0.5379 0.6598 0.815 0.434
##
## Residual standard error: 17.8 on 10 degrees of freedom
## Multiple R-squared: 0.06233, Adjusted R-squared: -0.03143
## F-statistic: 0.6648 on 1 and 10 DF, p-value: 0.4339
################### Simpson's Evenness ###################
summary(lm(frac_bacprod ~ mean, data = simpseven)) # All samples together##
## Call:
## lm(formula = frac_bacprod ~ mean, data = simpseven)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.000 -7.163 -3.815 3.392 45.845
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.8524 9.2286 -0.092 0.9272
## mean 274.1606 134.4253 2.040 0.0536 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.26 on 22 degrees of freedom
## Multiple R-squared: 0.159, Adjusted R-squared: 0.1208
## F-statistic: 4.16 on 1 and 22 DF, p-value: 0.05359
# Particle-Associated
lm_prod_vs_simpseven_PA <- lm(frac_bacprod ~ mean, data = filter(simpseven, fraction == "Particle"))
summary(lm_prod_vs_simpseven_PA)##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(simpseven, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.973 -2.229 -1.086 1.356 12.380
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.829 4.539 -0.844 0.41865
## mean 234.057 71.238 3.286 0.00821 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.995 on 10 degrees of freedom
## Multiple R-squared: 0.5191, Adjusted R-squared: 0.471
## F-statistic: 10.79 on 1 and 10 DF, p-value: 0.008211
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_simpseven_PA <- train(
frac_bacprod ~ mean, data = filter(simpseven, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_simpseven_PA$results # Particle-Associated CV results ## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 6.190526 0.6495025 4.726163 2.854309 0.304804 2.126732
summary(lm(frac_bacprod ~ mean, data = filter(simpseven, fraction == "Free"))) # Free Living ##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(simpseven, fraction ==
## "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.389 -12.029 -3.306 4.668 39.946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.85 23.52 0.674 0.516
## mean 114.96 321.02 0.358 0.728
##
## Residual standard error: 18.27 on 10 degrees of freedom
## Multiple R-squared: 0.01266, Adjusted R-squared: -0.08607
## F-statistic: 0.1282 on 1 and 10 DF, p-value: 0.7277
#################################### Per-Cell Production ####################################
################### Richness ###################
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = richness)) # All samples together ##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = richness)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8668 -0.2124 0.1045 0.2133 0.6473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.4591277 0.2212633 -38.231 < 2e-16 ***
## mean 0.0028988 0.0004641 6.246 3.4e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3804 on 21 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6501, Adjusted R-squared: 0.6334
## F-statistic: 39.02 on 1 and 21 DF, p-value: 3.395e-06
# Particle-Associated
lm_percell_prod_vs_rich_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle"))
summary(lm_percell_prod_vs_rich_PA)##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55347 -0.21545 -0.01066 0.12536 0.58830
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.0617648 0.3717671 -21.685 4.44e-09 ***
## mean 0.0023794 0.0006348 3.748 0.00457 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3506 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6095, Adjusted R-squared: 0.5662
## F-statistic: 14.05 on 1 and 9 DF, p-value: 0.004567
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_percell_prod_vs_rich_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_rich_PA$results # Particle-Associated CV results ## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.4075388 0.6452733 0.3251794 0.142633 0.3137362 0.1268408
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Free"))) # Free Living ##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71719 -0.13833 0.07155 0.23581 0.56949
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.135758 0.471253 -17.264 8.99e-09 ***
## mean 0.001657 0.001353 1.225 0.249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3859 on 10 degrees of freedom
## Multiple R-squared: 0.1304, Adjusted R-squared: 0.04344
## F-statistic: 1.499 on 1 and 10 DF, p-value: 0.2488
################### Shannon Entropy ###################
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = shannon)) # All samples together ##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = shannon)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.03514 -0.15042 -0.03394 0.26568 0.82794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.9036 0.7534 -14.472 2.14e-12 ***
## mean 0.8805 0.1763 4.993 6.09e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4348 on 21 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.5428, Adjusted R-squared: 0.521
## F-statistic: 24.93 on 1 and 21 DF, p-value: 6.09e-05
# Particle-Associated
lm_percell_prod_vs_shannon_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Particle"))
summary(lm_percell_prod_vs_shannon_PA)##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36686 -0.23571 -0.01330 0.03961 0.70312
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.8897 0.8820 -11.213 1.37e-06 ***
## mean 0.6993 0.1935 3.614 0.00562 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3584 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.5921, Adjusted R-squared: 0.5468
## F-statistic: 13.06 on 1 and 9 DF, p-value: 0.00562
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_percell_prod_vs_shannon_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_shannon_PA$results # Particle-Associated CV results ## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.413426 0.6987126 0.3345857 0.1459863 0.3050087 0.1300826
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Free"))) # Free Living ##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72444 -0.17785 0.08114 0.13946 0.67366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.8666 1.6332 -5.429 0.000289 ***
## mean 0.3243 0.4091 0.793 0.446298
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4014 on 10 degrees of freedom
## Multiple R-squared: 0.05914, Adjusted R-squared: -0.03495
## F-statistic: 0.6285 on 1 and 10 DF, p-value: 0.4463
################### Inverse Simpson ###################
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = invsimps)) # All samples together ##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = invsimps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.97302 -0.28199 -0.05285 0.32003 0.99088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.86039 0.18153 -43.301 < 2e-16 ***
## mean 0.02355 0.00525 4.485 0.000204 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4596 on 21 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.4893, Adjusted R-squared: 0.4649
## F-statistic: 20.12 on 1 and 21 DF, p-value: 0.0002037
# Particle-Associated
lm_percell_prod_vs_invsimps_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle"))
summary(lm_percell_prod_vs_invsimps_PA)##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28651 -0.18384 -0.11125 0.07337 0.56444
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.360961 0.159490 -46.153 5.27e-12 ***
## mean 0.018087 0.003759 4.812 0.000958 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2969 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.7201, Adjusted R-squared: 0.689
## F-statistic: 23.15 on 1 and 9 DF, p-value: 0.0009581
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_percell_prod_vs_invsimps_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_invsimps_PA$results # Particle-Associated CV results ## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.3569485 0.7164722 0.2874183 0.1139091 0.3273592 0.09448645
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Free"))) # Free Living ##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68269 -0.11512 0.01325 0.17742 0.63175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.03513 0.35685 -22.517 6.72e-10 ***
## mean 0.01910 0.01409 1.355 0.205
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3803 on 10 degrees of freedom
## Multiple R-squared: 0.1551, Adjusted R-squared: 0.07064
## F-statistic: 1.836 on 1 and 10 DF, p-value: 0.2052
################### Simpson's Evenness ###################
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = simpseven)) # All samples together ##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = simpseven)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.06874 -0.41920 0.04553 0.34511 1.56565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.4496 0.4116 -18.10 2.74e-14 ***
## mean 4.3470 6.0344 0.72 0.479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6353 on 21 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.02411, Adjusted R-squared: -0.02236
## F-statistic: 0.5189 on 1 and 21 DF, p-value: 0.4792
# Particle-Associated
lm_percell_prod_vs_simpseven_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Particle"))
summary(lm_percell_prod_vs_simpseven_PA)##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4726 -0.2230 -0.1106 0.1248 0.6887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.5941 0.2885 -26.324 7.96e-10 ***
## mean 15.1887 4.6341 3.278 0.00957 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3788 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.4935
## F-statistic: 10.74 on 1 and 9 DF, p-value: 0.009566
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_percell_prod_vs_simpseven_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_simpseven_PA$results # Particle-Associated CV results ## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.4501359 0.6702939 0.362118 0.144351 0.316542 0.1181178
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Free"))) # Free Living ##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63112 -0.24216 0.01388 0.14672 0.77426
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.9274 0.5202 -15.240 3e-08 ***
## mean 4.9361 7.1008 0.695 0.503
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4041 on 10 degrees of freedom
## Multiple R-squared: 0.04609, Adjusted R-squared: -0.0493
## F-statistic: 0.4832 on 1 and 10 DF, p-value: 0.5028
R2 table
########## PUT A TABLE TOGETHER
# Per Liter Production
perliter_row1 <- c("Richness", "Per-Liter",
round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 3),
round(cv_lm_prod_vs_rich_PA$results$Rsquared, digits = 3),
round(cv_lm_prod_vs_rich_PA$results$RsquaredSD, digits = 3))
perliter_row2 <- c("Shannon_Entropy", "Per-Liter",
round(summary(lm_prod_vs_shannon_PA)$adj.r.squared, digits = 3),
round(cv_lm_prod_vs_shannon_PA$results$Rsquared, digits = 3),
round(cv_lm_prod_vs_shannon_PA$results$RsquaredSD, digits = 3))
perliter_row3 <- c("Inverse_Simpson", "Per-Liter",
round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 3),
round(cv_lm_prod_vs_invsimps_PA$results$Rsquared, digits = 3),
round(cv_lm_prod_vs_invsimps_PA$results$RsquaredSD, digits = 3))
perliter_row4 <- c("Simpsons_Evenness","Per-Liter",
round(summary(lm_prod_vs_simpseven_PA)$adj.r.squared, digits = 3),
round(cv_lm_prod_vs_simpseven_PA$results$Rsquared, digits = 3),
round(cv_lm_prod_vs_simpseven_PA$results$RsquaredSD, digits = 3))
# Per cell production
percell_row1 <- c("Richness", "Per-Cell",
round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 3),
round(cv_lm_percell_prod_vs_rich_PA$results$Rsquared, digits = 3),
round(cv_lm_percell_prod_vs_rich_PA$results$RsquaredSD, digits = 3))
percell_row2 <- c("Shannon_Entropy", "Per-Cell",
round(summary(lm_percell_prod_vs_shannon_PA)$adj.r.squared, digits = 3),
round(cv_lm_percell_prod_vs_shannon_PA$results$Rsquared, digits = 3),
round(cv_lm_percell_prod_vs_shannon_PA$results$RsquaredSD, digits = 3))
percell_row3 <- c("Inverse_Simpson", "Per-Cell",
round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 3),
round(cv_lm_percell_prod_vs_invsimps_PA$results$Rsquared, digits = 3),
round(cv_lm_percell_prod_vs_invsimps_PA$results$RsquaredSD, digits = 3))
percell_row4 <- c("Simpsons_Evenness", "Per-Cell",
round(summary(lm_percell_prod_vs_simpseven_PA)$adj.r.squared, digits = 3),
round(cv_lm_percell_prod_vs_simpseven_PA$results$Rsquared, digits = 3),
round(cv_lm_percell_prod_vs_simpseven_PA$results$RsquaredSD, digits = 3))
r2_table <- as.data.frame(rbind(perliter_row1, perliter_row2, perliter_row3, perliter_row4,
percell_row1, percell_row2, percell_row3, percell_row4))
colnames(r2_table) <- c("Diversity_Metric", "Production_Measure","Adjusted_R2","CV_R2", "CV_R2_SD")
row.names(r2_table) = NULL
pander(r2_table,
caption = "Supplemental Table 2: \n R2 estimates for heterotrophic production vs particle-associated diversity linear regressions.")| Diversity_Metric | Production_Measure | Adjusted_R2 | CV_R2 | CV_R2_SD |
|---|---|---|---|---|
| Richness | Per-Liter | 0.559 | 0.641 | 0.297 |
| Shannon_Entropy | Per-Liter | 0.523 | 0.669 | 0.292 |
| Inverse_Simpson | Per-Liter | 0.692 | 0.76 | 0.252 |
| Simpsons_Evenness | Per-Liter | 0.471 | 0.65 | 0.305 |
| Richness | Per-Cell | 0.566 | 0.645 | 0.314 |
| Shannon_Entropy | Per-Cell | 0.547 | 0.699 | 0.305 |
| Inverse_Simpson | Per-Cell | 0.689 | 0.716 | 0.327 |
| Simpsons_Evenness | Per-Cell | 0.493 | 0.67 | 0.317 |
Figure 2
######################################################### OBSERVED RICHNESS #########################################################
rich_fraction_wilcox <- wilcox.test(mean ~ fraction, data = richness)
rich_fraction_wilcox##
## Wilcoxon rank sum test
##
## data: mean by fraction
## W = 129, p-value = 0.0004955
## alternative hypothesis: true location shift is not equal to 0
filter(richness) %>%
group_by(fraction) %>%
summarize(mean(mean), sd(mean))## # A tibble: 2 x 3
## fraction `mean(mean)` `sd(mean)`
## <fctr> <dbl> <dbl>
## 1 Particle 556.7992 167.31404
## 2 Free 338.4242 85.98665
# Make a data frame to draw significance line in boxplot (visually calculated)
rich_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(920, 930, 930, 920)) # WholePart vs WholeFree
rich_distribution_plot <-
ggplot(richness, aes(y = mean, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_y_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) +
xlab("Observed Richness") + xlab("Fraction") +
scale_shape_manual(values = season_shapes) +
geom_path(data = rich_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=980, size = 8, color = "#424645", label= paste("***"), angle = 90) +
annotate("text", x=1.5, y=790, size = 4, color = "#424645",
label= paste("p =", round(rich_fraction_wilcox$p.value, digits = 5))) +
theme(legend.position = "none",# axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Richness vs Community-wide (Per-Liter) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_perliter_rich_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3), ")")
## 2. Plot Richness vs Community-wide (Per-Liter) Production
prod_vs_rich_plot <-
ggplot(richness, aes(x=mean, y=frac_bacprod)) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) + # Y-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
ylab("\n Community Production \n (μgC/L/day)") +
xlab("Observed Richness") +
geom_smooth(data=filter(richness, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) +
# Add the R2 and p-value to the plot
annotate("text", x=790, y=45, label=lm_lab_perliter_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank())
################ Richness vs Per Capita (Per-Cell) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_percell_rich_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_percell_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3), ")")
## 2. Plot Richness vs Per Capita (Per-Cell) Production
percell_prod_vs_rich_plot <-
ggplot(richness, aes(x=mean, y=log10(fracprod_per_cell_noinf))) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
xlab("Observed Richness") +
geom_smooth(data=filter(richness, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) +
#scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) +
# Add the R2 and p-value to the plot
annotate("text", x=790, y=-7.5, label=lm_lab_percell_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
######################################################### INVERSE SIMPSON #########################################################
invsimps_fraction_wilcox <- wilcox.test(mean ~ fraction, data = invsimps)
invsimps_fraction_wilcox##
## Wilcoxon rank sum test
##
## data: mean by fraction
## W = 81, p-value = 0.6297
## alternative hypothesis: true location shift is not equal to 0
filter(invsimps) %>%
group_by(fraction) %>%
summarize(mean(mean), sd(mean))## # A tibble: 2 x 3
## fraction `mean(mean)` `sd(mean)`
## <fctr> <dbl> <dbl>
## 1 Particle 35.47659 23.843325
## 2 Free 24.09219 8.136901
# Make a data frame to draw significance line in boxplot (visually calculated)
invsimps_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(83,85,85,83)) # WholePart vs WholeFree
invsimps_distribution_plot <-
ggplot(invsimps, aes(y = mean, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_y_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
ylab("Inverse Simpson") + xlab("Fraction") +
geom_path(data = invsimps_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=80, size = 4, color = "#424645", label= "NS") +
theme(legend.position = "none", #axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Inverse Simpson vs Community-wide (Per-Liter) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_perliter_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4), ")")
## 2. Plot Inverse Simpson vs Community-wide (Per-Liter) Production
prod_vs_invsimps_plot <-
ggplot(invsimps, aes(x=mean, y=frac_bacprod)) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) + # Y-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
ylab("Community Production \n (μgC/L/day)") +
xlab("Inverse Simpson") +
geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
# Add the R2 and p-value to the plot
annotate("text", x=70, y=45, label=lm_lab_perliter_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank())
################ Inverse Simpson vs Per Capitra (Per-Cell) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_percell_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_percell_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4), ")")
## 2. Plot Inverse Simpson vs Per Capitra (Per-Cell) Production
percell_prod_vs_invsimps_plot <-
ggplot(invsimps, aes(x=mean, y=log10(fracprod_per_cell_noinf), fill = fraction, color = fraction)) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_point(size = 3.5, color = "black", aes(shape = season)) +
scale_color_manual(values = fraction_colors, guide = TRUE) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
ylab("log10(production/cell)\n (μgC/cell/day)") +
xlab("Inverse Simpson") +
geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
#scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) +
# Add the R2 and p-value to the plot
annotate("text", x=70, y=-7.5, label=lm_lab_percell_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
guides(color = guide_legend(override.aes = list(shape = 15))) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
# Plot Inverse simpson plots altogether
invsimps_plots <- plot_grid(invsimps_distribution_plot, prod_vs_invsimps_plot, percell_prod_vs_invsimps_plot,
labels = c("B", "D", "F"), ncol = 1, nrow = 3,
rel_heights = c(0.5, 0.8, 1.1),
align = "v")
# All 3 richness plots together
rich_plots <- plot_grid(rich_distribution_plot, prod_vs_rich_plot,
percell_prod_vs_rich_plot + theme(legend.position = "none"),
labels = c("A", "C", "E"), ncol = 1, nrow = 3,
rel_heights = c(0.5, 0.8, 1),
align = "v")
invsimps_plots_noyaxis <-
plot_grid(invsimps_distribution_plot + theme(axis.title.y = element_blank(), axis.text.y = element_blank()),
prod_vs_invsimps_plot + theme(axis.title.y = element_blank()),
percell_prod_vs_invsimps_plot + theme(axis.title.y = element_blank(), legend.position = "none"),
labels = c("B", "D", "F"), ncol = 1, nrow = 3,
rel_heights = c(0.5, 0.8, 1),
align = "v")
# Extract distribution plots of richness and inverse simpson
figure2_row1 <- plot_grid(rich_plots, invsimps_plots_noyaxis,
ncol = 2, nrow = 1, rel_widths = c(1, 0.75),
align = "h")
######## PLOT FIGURE 2
plot_grid(figure2_row1, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05))Figure 3
Unweighted ses.mpd
######################################################### SES MPD DISTRIBUTION: UNWEIGHTED
################ Figure 3A: Distribution of Particle and Free Unweighted Mean Pairwise distance
## 1. Is there a difference between particle and free Unweighted Mean Pairwise distance?
unweighted_fraction_wilcox <- wilcox.test(mpd.obs.z ~ fraction, data = unweighted_df)
unweighted_fraction_wilcox##
## Wilcoxon rank sum test
##
## data: mpd.obs.z by fraction
## W = 30, p-value = 0.01449
## alternative hypothesis: true location shift is not equal to 0
# 2. What are the means of particle and free unweighted PD?
unweighted_df %>%
group_by(fraction) %>%
summarize(mean(mpd.obs.z))## # A tibble: 2 x 2
## fraction `mean(mpd.obs.z)`
## <fctr> <dbl>
## 1 Particle -0.4971164
## 2 Free 0.4375532
## 3. Plot Unweighted Phylogenetic Diversity vs Richness
# Make a data frame to draw significance line in boxplot (visually calculated)
dat4 <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(1.6,1.7,1.7,1.6)) # WholePart vs WholeFree
unweight_distribution_plot <-
ggplot(unweighted_df, aes(y = mpd.obs.z, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_shape_manual(values = season_shapes) +
scale_y_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
ylab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
xlab("Fraction") +
geom_hline(yintercept = 0, linetype = "dashed", size = 1.5) +
##### Particle vs free per-cell production
geom_path(data = dat4, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=1.35, size = 4, color = "#424645",
label= paste("p =", round(unweighted_fraction_wilcox$p.value, digits = 2))) +
theme(legend.position = "none",# axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Figure 3B: Unweighted Phylogenetic Diversity vs Richness
## 1. Is there a relationship between richness and Unweighted Mean Pairwise distance?
lm_richness_vs_unweight <- lm(mean ~ mpd.obs.z, data = unweighted_df)
# Linear model results for particle-associated only
summary(lm(mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))##
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(unweighted_df, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -191.79 -112.16 -41.89 55.90 278.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 515.15 51.16 10.069 1.49e-06 ***
## mpd.obs.z -83.78 49.91 -1.679 0.124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 155 on 10 degrees of freedom
## Multiple R-squared: 0.2199, Adjusted R-squared: 0.1419
## F-statistic: 2.818 on 1 and 10 DF, p-value: 0.1241
# Linear model results for free-living only
summary(lm(mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))##
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(unweighted_df, fraction ==
## "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -100.15 -66.22 -18.70 43.65 172.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 337.088 42.753 7.884 1.34e-05 ***
## mpd.obs.z 3.053 77.510 0.039 0.969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90.18 on 10 degrees of freedom
## Multiple R-squared: 0.0001551, Adjusted R-squared: -0.09983
## F-statistic: 0.001551 on 1 and 10 DF, p-value: 0.9694
## 2. Extract the R2 and p-value from the linear model:
lm_lab_richness_vs_unweight<- paste("atop(R^2 ==", round(summary(lm_richness_vs_unweight)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_richness_vs_unweight)$coefficients[,4][2]), digits = 3), ")")
## 3. Plot Unweighted Phylogenetic Diversity vs Richness
unweight_rich_vs_mpd_plot <-
ggplot(unweighted_df, aes(y = mean, x = mpd.obs.z)) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
geom_point(size = 3, aes(fill = fraction, shape = season)) + ylab("\n Observed Richness") +
scale_shape_manual(values = season_shapes) +
xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
scale_fill_manual(values = fraction_colors) +
geom_smooth(method = "lm", color = "#424645", fill = "#424645", alpha = 0.3) +
scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
# Add the R2 and p-value to the plot
annotate("text", x=0.75, y=750, label=lm_lab_richness_vs_unweight, parse = TRUE, color = "#424645", size = 4) +
theme(legend.title = element_blank(), legend.position = "none",
axis.text.x = element_blank(), axis.title.x = element_blank())
################ Figure 3C: Unweighted Mean Pairwise distance vs Community-Wide Production
# Is there a relationship between Production and Unweighted Mean Pairwise distance?
summary(lm(frac_bacprod ~ mpd.obs.z, data = unweighted_df)) # NS ##
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = unweighted_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.416 -9.547 -3.120 8.193 44.030
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.125 3.102 5.520 1.51e-05 ***
## mpd.obs.z 3.901 3.768 1.035 0.312
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.19 on 22 degrees of freedom
## Multiple R-squared: 0.04645, Adjusted R-squared: 0.003106
## F-statistic: 1.072 on 1 and 22 DF, p-value: 0.3118
# Linear model results for particle-associated only
summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))##
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.145 -4.312 -1.824 3.402 19.527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.213 2.617 3.138 0.0105 *
## mpd.obs.z -3.511 2.553 -1.376 0.1990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.928 on 10 degrees of freedom
## Multiple R-squared: 0.1591, Adjusted R-squared: 0.07501
## F-statistic: 1.892 on 1 and 10 DF, p-value: 0.199
# Linear model results for free-living only
summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))##
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.996 -14.939 2.192 8.751 37.004
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.191 8.398 2.166 0.0555 .
## mpd.obs.z 13.410 15.225 0.881 0.3991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.71 on 10 degrees of freedom
## Multiple R-squared: 0.072, Adjusted R-squared: -0.0208
## F-statistic: 0.7758 on 1 and 10 DF, p-value: 0.3991
unweight_prod_vs_mpd_plot <-
ggplot(unweighted_df, aes(y = frac_bacprod, x = mpd.obs.z, fill = fraction)) +
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
geom_point(size = 3, aes(shape = season)) +
scale_shape_manual(values = season_shapes) +
ylab("\n Community Production \n (μgC/L/day)") +
xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
scale_fill_manual(values = fraction_colors) +
scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
theme(legend.title = element_blank(), legend.position = "none",
axis.text.x = element_blank(), axis.title.x = element_blank())
################ Figure 3D: Unweighted Phylogenetic Diversity vs Per Capitra (Per-Cell) Production
## 1. Run the linear model: Is there a relationship between PER-CELL PRODUCTION and Unweighted Mean Pairwise distance?
unweight_vs_percell <- lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = unweighted_df)
# Linear model results for particle-associated only
summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3795 -0.2419 -0.1651 0.0443 1.1815
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.9248 0.1711 -40.475 1.71e-11 ***
## mpd.obs.z -0.3299 0.1627 -2.028 0.0732 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4649 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3136, Adjusted R-squared: 0.2373
## F-statistic: 4.112 on 1 and 9 DF, p-value: 0.07319
# Linear model results for free-living only
summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63978 -0.29782 0.07694 0.17610 0.76503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.55622 0.19603 -38.545 3.3e-12 ***
## mpd.obs.z -0.04303 0.35540 -0.121 0.906
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4135 on 10 degrees of freedom
## Multiple R-squared: 0.001463, Adjusted R-squared: -0.09839
## F-statistic: 0.01466 on 1 and 10 DF, p-value: 0.906
## 2. Extract the R2 and p-value from the linear model:
lm_lab_percell_unweightPD <- paste("atop(R^2 ==", round(summary(unweight_vs_percell)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(unweight_vs_percell)$coefficients[,4][2]), digits = 4), ")")
## 3. Plot Unweighted Phylogenetic Diversity vs Per Capitra (Per-Cell) Production
unweight_percell_vs_mpd_plot <-
ggplot(unweighted_df,
aes(y = log10(fracprod_per_cell_noinf), x = mpd.obs.z)) +
geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
geom_point(size = 3, aes(fill = fraction, shape = season)) +
ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
xlab("Unweighted Phylogenetic Diversity") +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
geom_smooth(method = "lm", color="#424645", fill = "#424645", alpha = 0.3) +
scale_y_continuous(limits = c(-8.5,-5), breaks = c(-8, -7, -6)) +
scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
# Add the R2 and p-value to the plot
annotate("text", x=0.75, y=-6, label=lm_lab_percell_unweightPD, parse = TRUE, color = "#424645", size = 4) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
######## FIGURE 3
figure3_row1 <- plot_grid(unweight_distribution_plot, unweight_rich_vs_mpd_plot, unweight_prod_vs_mpd_plot,
unweight_percell_vs_mpd_plot + theme(legend.position = "none"),
labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4,
rel_heights = c(0.5, 1, 1, 1.2),
align = "v")
plot_grid(figure3_row1, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05))LASSO
Prepare the data for Ridge & Lasso Regressions
unweight_vals <- unweighted_df %>%
dplyr::select(mpd.obs.z, norep_filter_name) %>%
rename(Unweighted_PD = mpd.obs.z)
weighted_vals <- weighted_df %>%
dplyr::select(mpd.obs.z, norep_filter_name) %>%
rename(Weighted_PD = mpd.obs.z)
wide_all_divs <- otu_alphadiv %>%
dplyr::select(norep_filter_name ,mean, measure) %>%
spread(measure, mean)
lasso_data_df <- metadata_pca %>%
left_join(wide_all_divs, by = "norep_filter_name") %>%
left_join(unweight_vals, by = "norep_filter_name") %>%
left_join(weighted_vals, by = "norep_filter_name") %>%
dplyr::select(-c(project, year, Date, limnion, norep_water_name, dnaconcrep1,
SD_perc_attached_bacprod, SE_total_bac_abund, SE_perc_attached_abund, SE_attached_bac))
### PARTICLE DATA
lasso_data_df_particle <- lasso_data_df %>%
filter(fraction == "Particle" ) %>%
dplyr::select(-fraction)
lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
norep_filter_name, BGA_cellspermL, Turb_NTU,
lakesite, season, station))
percell_lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
dplyr::select(-c(fracprod_per_cell, frac_bacprod,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )
## ALL DATA
all_dat_lasso_comm <- lasso_data_df %>%
dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
norep_filter_name, BGA_cellspermL, Turb_NTU,
lakesite, season, station, fraction))
all_dat_lasso_percapita <- lasso_data_df %>%
dplyr::select(-c(fracprod_per_cell, frac_bacprod,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, fraction,
lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )Perform Ridge and Lasso regression
Lasso: Bulk Community Production: PARTICLE
# Set the seed for reproducibility of the grid values
set.seed(777)
################ PREPARE DATA ################
# Subset data needed and scale all o
scaled_comm_data <-
lasso_data_df_particle_noprod %>% # Use data only for the particle samples
scale() %>% # Scale all of the variables to have mean =0 and sd = 1
as.data.frame() # Make it a dataframe so that model.matrix function works (does not take a matrix)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(frac_bacprod ~ ., scaled_comm_data)[,-1]
y = scaled_comm_data$frac_bacprod
grid = 10^seq(10,-2,length = 100)
################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ RIDGE ################
# Run RIDGE regression with alpha = 0
ridge_divs_train <- glmnet(x[train,], y[train], alpha = 0, lambda = grid, thresh = 1e-12, standardize = TRUE)
par(mfrow = c(1,2))
plot(ridge_divs_train)
# Cross validation
cv_ridge_divs <- cv.glmnet(x[train,], y[train], alpha = 0)
plot(cv_ridge_divs)best_ridge_lambda <- cv_ridge_divs$lambda.min
ridge_divs_pred <- predict(ridge_divs_train, s = best_ridge_lambda, newx = x[test,])
mean((ridge_divs_pred - y_test)^2) # Test MSE## [1] 1.104789
## Run ridge on the entire dataset
ridge_divs <- glmnet(x, y, alpha = 0, lambda = grid, standardize = TRUE)
par(mfrow = c(1,1))
plot(ridge_divs)################ LASSO ################
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = TRUE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
plot(cv_lasso_divs)best_lasso_lambda <- cv_lasso_divs$lambda.min
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)## [1] 1.405385
## Run lasso on the entire dataset with the best lambda value
lasso_divs <- glmnet(x, y, alpha = 1, lambda = grid, standardize = TRUE)
par(mfrow = c(1,1))
plot(lasso_divs)plot(lasso_divs, xvar = "lambda", label = TRUE)plot(lasso_divs, xvar = "dev", label = TRUE)# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)## 35 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.499450e-17
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## perc_attached_bacprod .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 1.609355e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
The test MSE for ridge regression is 1.1047887 while the test MSE for lasso is 1.4053855.
Additionally, the lasso model uses Inverse Simpson as the best and only predictor of production!
Lasso: Per capita Production: PARTICLE
scaled_percapita_data <- percell_lasso_data_df_particle_noprod %>%
dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
dplyr::select(-c(fracprod_per_cell_noinf)) %>%
scale() %>%
as.data.frame()
set.seed(777)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_data)[,-1]
y = scaled_percapita_data$log10_percell
grid = 10^seq(10,-2,length = 100)
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ RIDGE
# Run RIDGE regression with alpha = 0
ridge_divs_train <- glmnet(x[train,], y[train], alpha = 0, lambda = grid, thresh = 1e-12, standardize = FALSE)
par(mfrow = c(1,2))
plot(ridge_divs_train)
# Cross validation
cv_ridge_divs <- cv.glmnet(x[train,], y[train], alpha = 0)
plot(cv_ridge_divs)best_ridge_lambda <- cv_ridge_divs$lambda.min
ridge_divs_pred <- predict(ridge_divs_train, s = best_ridge_lambda, newx = x[test,])
mean((ridge_divs_pred - y_test)^2) # Test MSE## [1] 1.57874
## Run ridge on the entire dataset
ridge_divs <- glmnet(x, y, alpha = 0, lambda = grid, standardize = FALSE)
par(mfrow = c(1,1))
plot(ridge_divs)################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
plot(cv_lasso_divs)best_lasso_lambda <- cv_lasso_divs$lambda.min
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)## [1] 1.743068
## Run lasso on the entire dataset
lasso_divs <- glmnet(x, y, alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,1))
plot(lasso_divs)plot(lasso_divs, xvar = "lambda", label = TRUE)plot(lasso_divs, xvar = "dev", label = TRUE)# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)## 35 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -7.117994e-16
## Sample_depth_m .
## Temp_C -1.378715e-01
## SpCond_uSpercm .
## TDS_mgperL .
## pH -3.662284e-03
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund -6.587419e-03
## attached_bac -3.907083e-02
## perc_attached_abund .
## perc_attached_bacprod .
## fraction_bac_abund -3.577572e-18
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 1.233827e-02
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 4.347245e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
Lasso: Community-wide Production: ALL SAMPLES
# Set the seed for reproducibility of the grid values
set.seed(777)
################ PREPARE DATA ################
# Subset data needed and scale all o
scaled_comm_data_ALL <-
all_dat_lasso_percapita %>% # Use data only for the particle samples
scale() %>% # Scale all of the variables to have mean =0 and sd = 1
as.data.frame() # Make it a dataframe so that model.matrix function works (does not take a matrix)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(frac_bacprod ~ ., scaled_comm_data_ALL)[,-1]## Error in eval(predvars, data, env): object 'frac_bacprod' not found
y = scaled_comm_data$frac_bacprod
grid = 10^seq(10,-2,length = 100)
################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO ################
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = TRUE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
plot(cv_lasso_divs)best_lasso_lambda <- cv_lasso_divs$lambda.min
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)## Error in mean((lasso_divs_pred - y_test)^2): dims [product 6] do not match the length of object [7]
## Run lasso on the entire dataset with the best lambda value
lasso_divs <- glmnet(x, y, alpha = 1, lambda = grid, standardize = TRUE)## Error in glmnet(x, y, alpha = 1, lambda = grid, standardize = TRUE): number of observations in y (12) not equal to the number of rows of x (11)
par(mfrow = c(1,1))
plot(lasso_divs)plot(lasso_divs, xvar = "lambda", label = TRUE)plot(lasso_divs, xvar = "dev", label = TRUE)# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)## 35 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -5.352278e-16
## Sample_depth_m .
## Temp_C -1.619342e-01
## SpCond_uSpercm .
## TDS_mgperL .
## pH -3.623885e-02
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund -7.539204e-02
## attached_bac -8.188182e-02
## perc_attached_abund .
## perc_attached_bacprod .
## fraction_bac_abund -3.174675e-03
## PC1 .
## PC2 .
## PC3 .
## PC4 1.594507e-02
## PC5 1.035121e-01
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 4.266024e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD -1.690375e-02
Lasso: Per-capita Production: ALL SAMPLES
set.seed(777)
scaled_percapita_ALL <- all_dat_lasso_percapita %>%
dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
dplyr::select(-c(fracprod_per_cell_noinf, perc_attached_abund, fraction_bac_abund, attached_bac)) %>%
scale() %>%
as.data.frame()
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_ALL)[,-1]
y = scaled_percapita_ALL$log10_percell
grid = 10^seq(10,-2,length = 100)
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ RIDGE
# Run RIDGE regression with alpha = 0
ridge_divs_train <- glmnet(x[train,], y[train], alpha = 0, lambda = grid, thresh = 1e-12, standardize = TRUE)
par(mfrow = c(1,2))
plot(ridge_divs_train)
# Cross validation
cv_ridge_divs <- cv.glmnet(x[train,], y[train], alpha = 0)
plot(cv_ridge_divs)best_ridge_lambda <- cv_ridge_divs$lambda.min
ridge_divs_pred <- predict(ridge_divs_train, s = best_ridge_lambda, newx = x[test,])
mean((ridge_divs_pred - y_test)^2) # Test MSE## [1] 1.105553
## Run ridge on the entire dataset
ridge_divs <- glmnet(x, y, alpha = 0, lambda = grid, standardize = TRUE)
par(mfrow = c(1,1))
plot(ridge_divs)################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = TRUE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
plot(cv_lasso_divs)best_lasso_lambda <- cv_lasso_divs$lambda.min
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)## [1] 0.9141164
## Run lasso on the entire dataset
lasso_divs <- glmnet(x, y, alpha = 1, lambda = grid, standardize = TRUE)
par(mfrow = c(1,1))
plot(lasso_divs)plot(lasso_divs, xvar = "lambda", label = TRUE)plot(lasso_divs, xvar = "dev", label = TRUE)# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)## 32 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 5.315922e-16
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -9.932255e-02
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL -3.544802e-04
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## perc_attached_bacprod .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness 4.317733e-01
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD -9.098229e-02
## Weighted_PD .